In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer

from xai.data.reader import read_data
from xai.models import LightGBMModel, LogisticRegressionModel
from xai.validation import HoldOutValidation

1)

In [2]:
features, target = read_data('data/hotel_bookings.csv')
In [3]:
features.head()
Out[3]:
hotel lead_time arrival_date_year arrival_date_month arrival_date_week_number arrival_date_day_of_month stays_in_weekend_nights stays_in_week_nights adults children ... assigned_room_type booking_changes deposit_type agent days_in_waiting_list customer_type adr required_car_parking_spaces total_of_special_requests reservation_status_date
0 Resort Hotel 342 2015 7 27 1 0 0 2 0.0 ... C 3 No Deposit NaN 0 Transient 0.0 0 0 2015-07-01
1 Resort Hotel 737 2015 7 27 1 0 0 2 0.0 ... C 4 No Deposit NaN 0 Transient 0.0 0 0 2015-07-01
2 Resort Hotel 7 2015 7 27 1 0 1 1 0.0 ... C 0 No Deposit NaN 0 Transient 75.0 0 0 2015-07-02
3 Resort Hotel 13 2015 7 27 1 0 1 1 0.0 ... A 0 No Deposit 304.0 0 Transient 75.0 0 0 2015-07-02
4 Resort Hotel 14 2015 7 27 1 0 2 2 0.0 ... A 0 No Deposit 240.0 0 Transient 98.0 0 1 2015-07-03

5 rows × 29 columns

In [4]:
validation = HoldOutValidation(test_size=0.1, random_state=42)

(X_train, y_train), (X_test, y_test) = next(validation.split(features, target))
In [5]:
model = LightGBMModel(n_estimators=200, 
                      learning_rate=0.007, 
                      max_depth=-1, 
                      num_leaves=64,
                      n_jobs=4,
                      random_state=42)
In [6]:
model = model.fit(X_train, y_train)
In [7]:
y_pred_train = model.predict(X_train)
accuracy = np.mean(y_pred_train == y_train)
print(f'Train accuracy = {accuracy}')
Train accuracy = 0.8629607914305125
In [8]:
y_pred_test = model.predict(X_test)
accuracy = np.mean(y_pred_test == y_test)
print(f'Test accuracy = {accuracy}')
Test accuracy = 0.8622162660189295

2)

In [9]:
print(f'Model prediction:\t{model.predict(X_train.iloc[0:1]).values[0]}')
print(f'True value:\t{y_train.iloc[0]}')
Model prediction:	0
True value:	0

3)

In [10]:
# Apply the model's feature engineering, then encode categoricals as integer
# codes, which is the representation LimeTabularExplainer expects.
# .copy() avoids SettingWithCopyWarning when overwriting columns below.
X_train_transform = model._feature_engineering(X_train, train=False).copy()
feature_names = list(X_train_transform.columns)
categorical_features = [feature_id for feature_id, feature in enumerate(feature_names)
                        if X_train_transform[feature].dtype.name == 'category']
categorical_names = {}

for feature in categorical_features:
    # Shift the codes by 1 so that missing values (code -1) map to 0.
    categorical_map = dict(zip(
        X_train_transform.iloc[:, feature].cat.codes + 1,
        X_train_transform.iloc[:, feature]
    ))
    categorical_names[feature] = categorical_map
    X_train_transform.iloc[:, feature] = X_train_transform.iloc[:, feature].cat.codes + 1
In [11]:
explainer = LimeTabularExplainer(X_train_transform.values,
                                 training_labels=y_train.values,
                                 class_names=['Accepted', 'Canceled'],
                                 mode='classification',
                                 verbose=True,
                                 discretize_continuous=False,
                                 feature_names=feature_names,
                                 categorical_names=categorical_names,
                                 categorical_features=categorical_features
                                )
In [12]:
def predict_fn(data):
    """Decode LIME's numeric perturbations back into the frame the model expects."""
    data = pd.DataFrame(data.copy(), columns=feature_names)
    for feature in categorical_features:
        # LIME perturbs categorical columns as floats: round back to the
        # integer code, then map the code to the original category label.
        data.iloc[:, feature] = data.iloc[:, feature].round().astype(int).apply(
            lambda code: categorical_names[feature][code]).astype('category')
    return model.model.predict_proba(data)
In [13]:
print(f'Model prediction:\t{model.predict(X_train.iloc[0:1]).values[0]}')
print(f'True value:\t{y_train.iloc[0]}')

exp = explainer.explain_instance(X_train_transform.iloc[0],
                                 predict_fn=predict_fn,
                                 num_features=5)
exp.show_in_notebook(show_table=True, show_all=False)
Model prediction:	0
True value:	0
Intercept 0.811336348436762
Prediction_local [0.19827706]
Right: 0.29308023289591895
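
With `verbose=True`, LIME also prints its diagnostics: `Intercept` and `Prediction_local` describe the fitted local surrogate model, while `Right:` is the underlying model's predicted probability for the explained class, so the gap between the last two values indicates how faithful the local approximation is at this instance.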

4)

In [14]:
for i in range(4, 10):
    print(f'True value:\t{y_train.iloc[i]}')
    print(f'Model prediction:\t{model.predict(X_train.iloc[i:(i+1)]).values[0]}')
    exp = explainer.explain_instance(X_train_transform.iloc[i],
                                     predict_fn=predict_fn,
                                     num_features=5)
    exp.show_in_notebook(show_table=True, show_all=False)
True value:	0
Model prediction:	0
Intercept 0.7494293189275245
Prediction_local [0.33250648]
Right: 0.22193669595945928
True value:	0
Model prediction:	0
Intercept 0.8182528117029779
Prediction_local [-0.088693]
Right: 0.09014032737470684
True value:	0
Model prediction:	1
Intercept 0.8059573208572397
Prediction_local [0.2545175]
Right: 0.6744856704709893
True value:	0
Model prediction:	1
Intercept 0.8101567682265719
Prediction_local [0.20235864]
Right: 0.6197263071771142
True value:	1
Model prediction:	1
Intercept 0.8088514279434375
Prediction_local [0.24320354]
Right: 0.5024079255000273
True value:	1
Model prediction:	1
Intercept 0.7577030934452211
Prediction_local [0.26141604]
Right: 0.5083345647370692

As the examples above show, the LIME explanations consistently indicate that the most important variables are whether the guest paid a deposit for the hotel in advance and whether they require a parking space. Moreover, guests who have never cancelled a booking before are less likely to cancel the current one.
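
The same information can be read off programmatically rather than from the plots. A minimal sketch (not executed above), using the `exp` object left over from the last iteration of the loop:

for feature, weight in exp.as_list():
    print(f'{feature}: {weight:+.4f}')

`exp.as_list()` returns the (feature, weight) pairs of the local surrogate in order of decreasing absolute weight, which is exactly what the bar charts visualize.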

In [15]:
model = LogisticRegressionModel()
In [16]:
model = model.fit(X_train, y_train)
In [17]:
y_pred_train = model.predict(X_train)
accuracy = np.mean(y_pred_train == y_train)
print(f'Train accuracy = {accuracy}')
Train accuracy = 0.8067491228559995
In [18]:
y_pred_test = model.predict(X_test)
accuracy = np.mean(y_pred_test == y_test)
print(f'Test accuracy = {accuracy}')
Test accuracy = 0.8096993047993969
In [19]:
# Repeat the encoding step for the logistic regression pipeline
# (same transformation as for the LightGBM model above).
X_train_transform = model._feature_engineering(X_train, train=False).copy()
feature_names = list(X_train_transform.columns)
categorical_features = [feature_id for feature_id, feature in enumerate(feature_names)
                        if X_train_transform[feature].dtype.name == 'category']
categorical_names = {}

for feature in categorical_features:
    categorical_map = dict(zip(
        X_train_transform.iloc[:, feature].cat.codes + 1,
        X_train_transform.iloc[:, feature]
    ))
    categorical_names[feature] = categorical_map
    X_train_transform.iloc[:, feature] = X_train_transform.iloc[:, feature].cat.codes + 1
In [20]:
explainer = LimeTabularExplainer(X_train_transform.values,
                                 training_labels=y_train.values,
                                 class_names=['Accepted', 'Canceled'],
                                 mode='classification',
                                 verbose=True,
                                 discretize_continuous=False,
                                 feature_names=feature_names,
                                 categorical_names=categorical_names,
                                 categorical_features=categorical_features
                                )
In [21]:
def predict_fn(data):
    """Same wrapper as before, rebuilt so it closes over the retrained model
    and the freshly computed encoders."""
    data = pd.DataFrame(data.copy(), columns=feature_names)
    for feature in categorical_features:
        data.iloc[:, feature] = data.iloc[:, feature].round().astype(int).apply(
            lambda code: categorical_names[feature][code]).astype('category')
    return model.model.predict_proba(data)
In [22]:
for i in range(10):
    print(f'True value:\t{y_train.iloc[i]}')
    print(f'Model prediction:\t{model.predict(X_train.iloc[i:(i+1)]).values[0]}')
    exp = explainer.explain_instance(X_train_transform.iloc[i],
                                     predict_fn=predict_fn,
                                     num_features=5)
    exp.show_in_notebook(show_table=True, show_all=False)
True value:	0
Model prediction:	0
Intercept 0.4026041904602379
Prediction_local [0.40427269]
Right: 0.2816606339191864
True value:	0
Model prediction:	0
Intercept 0.39806772213203334
Prediction_local [0.3286629]
Right: 0.4137109807061423
True value:	0
Model prediction:	0
Intercept 0.3948713901098631
Prediction_local [0.39053072]
Right: 0.1137171107062349
True value:	0
Model prediction:	0
Intercept 0.4015572119814984
Prediction_local [0.42320149]
Right: 0.36707567908179767
True value:	0
Model prediction:	0
Intercept 0.3990774081675863
Prediction_local [0.44340894]
Right: 0.5215649616105716
True value:	0
Model prediction:	0
Intercept 0.3941357683053046
Prediction_local [-0.72990649]
Right: 8.813737228725012e-10
True value:	0
Model prediction:	0
Intercept 0.403025468662249
Prediction_local [0.3995398]
Right: 0.5685331656776622
True value:	0
Model prediction:	0
Intercept 0.3983694485574779
Prediction_local [0.41850369]
Right: 0.24593659559677042
True value:	1
Model prediction:	0
Intercept 0.39997983364783146
Prediction_local [0.5126516]
Right: 0.7191222889295492
True value:	1
Model prediction:	0
Intercept 0.3980235186310762
Prediction_local [0.50838391]
Right: 0.6819902083718002

For logistic regression we observe that the most important variables remain the same, but the parking requirement carries a noticeably larger weight, and the encoding of the categorical variable country weighs more than it did in the previous model, most likely because of the encoding used for this high-cardinality variable.
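
A rough way to turn these per-instance views into a model-level comparison is to average absolute LIME weights over a sample of instances. A sketch (not executed above) under the assumption that `explainer` and `predict_fn` are the ones built for the model being inspected; note that for categorical features the labels come back as `feature=value` pairs, so the ranking is per value rather than per raw column:

from collections import defaultdict

weights = defaultdict(list)
for i in range(20):
    exp = explainer.explain_instance(X_train_transform.iloc[i],
                                     predict_fn=predict_fn,
                                     num_features=len(feature_names))
    for label, weight in exp.as_list():
        weights[label].append(abs(weight))

# Mean absolute weight as a crude global importance score.
for label, ws in sorted(weights.items(), key=lambda kv: -np.mean(kv[1]))[:10]:
    print(f'{np.mean(ws):.4f}  {label}')

Running this once per model would make the shift in weight toward the parking and country features directly comparable between LightGBM and logistic regression.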